% Make some variable descriptions
cirrhosis_desc = {'age','albumin','alkaline phosphatase','ascites','serum bilirubin','serum cholesterol', ...
'edema treatment','hepatomegaly','platelets','prothrombin time','sex','aspartate aminotransferase','spiders',...
'stage','censoring','treatment','trigycerides','urine copper','log albumin','log serum bilirubin',...
'log prothrombin time','time'};
% Label the Categorial Variables
categVar = {'ascites','edema','hepato','sex','spiders','stage','trt'};
% Subset key variables w/o log
subset = {'age','albumin','bili','alkphos','chol','ast','protime',...
'platelet','spiders','hepato','ascites','edema'};
% Subset key variables w/ generated log features
subset_log = {'age','albumin','bili','alkphos','chol','ast','protime','platelet',...
'spiders','hepato','ascites','edema','logalb','logbil','logpro'};
cirrhosis.Properties.VariableDescriptions = cirrhosis_desc;
cirrhosis_names = cirrhosis.Properties.VariableNames;
head(cirrhosis)
ans = 8×22 table
| | age | albumin | alkphos | ascites | bili | chol | edema | hepato | platelet | protime | sex | ast | spiders | stage | status | trt | trig | copper | logalb | logbil | logpro | time |
|---|
| 1 | 58.7652 | 2.6000 | 1718 | 1 | 14.5000 | 261 | 1.0000 | 1 | 190 | 12.2000 | 1 | 137.9500 | 1 | 4 | 2 | 1 | 172 | 156 | 0.9555 | 2.6741 | 2.5014 | 400 |
|---|
| 2 | 56.4463 | 4.1400 | 7.3948e+03 | 0 | 1.1000 | 302 | 0 | 1 | 221 | 10.6000 | 1 | 113.5200 | 1 | 3 | 0 | 1 | 88 | 54 | 1.4207 | 0.0953 | 2.3609 | 4500 |
|---|
| 3 | 70.0726 | 3.4800 | 516 | 0 | 1.4000 | 176 | 0.5000 | 0 | 151 | 12.0000 | 0 | 96.1000 | 0 | 4 | 2 | 1 | 55 | 210 | 1.2470 | 0.3365 | 2.4849 | 1012 |
|---|
| 4 | 54.7406 | 2.5400 | 6.1218e+03 | 0 | 1.8000 | 244 | 0.5000 | 1 | 183 | 10.3000 | 1 | 60.6300 | 1 | 4 | 2 | 1 | 92 | 64 | 0.9322 | 0.5878 | 2.3321 | 1925 |
|---|
| 5 | 38.1054 | 3.5300 | 671 | 0 | 3.4000 | 279 | 0 | 1 | 136 | 10.9000 | 1 | 113.1500 | 1 | 3 | 1 | 2 | 72 | 143 | 1.2613 | 1.2238 | 2.3888 | 1504 |
|---|
| 6 | 66.2587 | 3.9800 | 944 | 0 | 0.8000 | 248 | 0 | 1 | NaN | 11.0000 | 1 | 93.0000 | 0 | 3 | 2 | 2 | 63 | 50 | 1.3813 | -0.2231 | 2.3979 | 2503 |
|---|
| 7 | 55.5346 | 4.0900 | 824 | 0 | 1.0000 | 322 | 0 | 1 | 204 | 9.7000 | 1 | 60.4500 | 0 | 3 | 0 | 2 | 213 | 52 | 1.4085 | 0 | 2.2721 | 1832 |
|---|
| 8 | 53.0568 | 4.0000 | 4.6512e+03 | 0 | 0.3000 | 280 | 0 | 0 | 373 | 11.0000 | 1 | 28.3800 | 0 | 3 | 2 | 2 | 189 | 52 | 1.3863 | -1.2040 | 2.3979 | 2466 |
|---|
cirr_summary = summary(cirrhosis);
N = numel(cirrhosis.Properties.VariableNames);
temp_table = struct('Min',zeros(N,1),'Max',zeros(N,1),'Range',zeros(N,1),...
'Median',zeros(N,1),'Mean',zeros(N,1),'NumMissing',zeros(N,1));
temp_struct = cirr_summary.(cirrhosis_names{i});
temp_table.Min(i) = temp_struct.Min;
temp_table.Max(i) = temp_struct.Max;
temp_table.Mean(i) = mean(cirrhosis.(cirrhosis_names{i}));
temp_table.Median(i) = temp_struct.Median;
temp_table.Range(i) = temp_struct.Max - temp_struct.Min;
temp_table.NumMissing(i) = temp_struct.NumMissing;
sumTable = table(cirrhosis_names', ...
temp_table.NumMissing, ...
'VariableNames',{'VarName','VarDesc','Min','Max','Mean','Median','Range','NumMissing'})
sumTable = 22×8 table
| | VarName | VarDesc | Min | Max | Mean | Median | Range | NumMissing |
|---|
| 1 | 'age' | 'age' | 26.2779 | 78.4394 | 50.0190 | 49.7947 | 52.1615 | 0 |
|---|
| 2 | 'albumin' | 'albumin' | 1.9600 | 4.6400 | 3.5200 | 3.5500 | 2.6800 | 0 |
|---|
| 3 | 'alkphos' | 'alkaline phosphatase' | 289.0000 | 1.3862e+04 | 1.9827e+03 | 1259 | 1.3573e+04 | 0 |
|---|
| 4 | 'ascites' | 'ascites' | 0 | 1 | 0.0769 | 0 | 1 | 0 |
|---|
| 5 | 'bili' | 'serum bilirubin' | 0.3000 | 28 | 3.2561 | 1.3500 | 27.7000 | 0 |
|---|
| 6 | 'chol' | 'serum cholesterol' | 120.0000 | 1775 | NaN | 309.5000 | 1655 | 28 |
|---|
| 7 | 'edema' | 'edema treatment' | 0 | 1 | 0.1106 | 0 | 1 | 0 |
|---|
| 8 | 'hepato' | 'hepatomegaly' | 0 | 1 | 0.5128 | 1 | 1 | 0 |
|---|
| 9 | 'platelet' | 'platelets' | 62.0000 | 563 | NaN | 257 | 501 | 4 |
|---|
| 10 | 'protime' | 'prothrombin time' | 9.0000 | 17.1000 | 10.7256 | 10.6000 | 8.1000 | 0 |
|---|
| 11 | 'sex' | 'sex' | 0 | 1 | 0.8846 | 1 | 1 | 0 |
|---|
| 12 | 'ast' | 'aspartate aminotransferase' | 26.3500 | 457.2500 | 122.5563 | 114.7000 | 430.9000 | 0 |
|---|
| 13 | 'spiders' | 'spiders' | 0 | 1 | 0.2885 | 0 | 1 | 0 |
|---|
| 14 | 'stage' | 'stage' | 1.0000 | 4 | 3.0321 | 3 | 3 | 0 |
|---|
| 15 | 'status' | 'censoring' | 0 | 2 | 0.8622 | 0 | 2 | 0 |
|---|
| 16 | 'trt' | 'treatment' | 1.0000 | 2 | 1.4936 | 1 | 1 | 0 |
|---|
| 17 | 'trig' | 'trigycerides' | 33.0000 | 598 | NaN | 108 | 565 | 30 |
|---|
| 18 | 'copper' | 'urine copper' | 4.0000 | 588 | NaN | 73 | 584 | 2 |
|---|
| 19 | 'logalb' | 'log albumin' | 0.6729 | 1.5347 | 1.2508 | 1.2669 | 0.8618 | 0 |
|---|
| 20 | 'logbil' | 'log serum bilirubin' | -1.2040 | 3.3322 | 0.5757 | 0.2994 | 4.5362 | 0 |
|---|
| 21 | 'logpro' | 'log prothrombin time' | 2.1972 | 2.8391 | 2.3686 | 2.3609 | 0.6419 | 0 |
|---|
| 22 | 'time' | 'time' | 41.0000 | 4556 | 2.0064e+03 | 1.8395e+03 | 4515 | 0 |
|---|
% Risk score computation as described in Cirrhosis paper
mayoFinalR = 0.871*cirrhosis.logbil ...
- 2.53*cirrhosis.logalb ...
+ 0.039*cirrhosis.age ...
+ 2.38*cirrhosis.logpro ...
% Scatterplot subset by risk high risk -> decreased survivability
set(f, 'Position', expandFigSize)
for i=1:size(cirrhosis,2)
boxplot(cirrhosis.(i),mayoFinalR >= median(mayoFinalR),'Labels',{'Low Risk','High Risk'})
title(cirrhosis_names{i})
sgtitle("Feature Boxplot Grouped by Risk Type")
[R_rank,P_rank] = corr(table2array(cirrhosis),'Type','Spearman','rows','complete');
r_rank_map = heatmap(R_rank);
title('Pairwise Rank Correlations')
r_rank_map.XDisplayLabels = cirrhosis_names;
r_rank_map.YDisplayLabels = cirrhosis_names;
% Features sorted by Correlation
absR_rank_time = abs(R_rank(1:end-1,end));
[absR_rank_time,I] = sort(absR_rank_time,'descend');
P_rank_time = P_rank(1:end-1,end);
cirrhosis_names_table = cirrhosis_names(1:end-1);
cirrhosis_names_table = cirrhosis_names_table(I);
rankTable = table(cirrhosis_names_table',absR_rank_time,P_rank_time(I),...
'VariableNames',{'Feature','Correlation','pValue'})
rankTable = 21×3 table
| | Feature | Correlation | pValue |
|---|
| 1 | 'bili' | 0.4982 | 0.0000 |
|---|
| 2 | 'logbil' | 0.4982 | 0.0000 |
|---|
| 3 | 'copper' | 0.4359 | 0.0000 |
|---|
| 4 | 'status' | 0.4059 | 0.0000 |
|---|
| 5 | 'albumin' | 0.3910 | 0.0000 |
|---|
| 6 | 'logalb' | 0.3910 | 0.0000 |
|---|
| 7 | 'stage' | 0.3716 | 0.0000 |
|---|
| 8 | 'edema' | 0.3375 | 0.0000 |
|---|
| 9 | 'ascites' | 0.3337 | 0.0000 |
|---|
| 10 | 'hepato' | 0.2666 | 0.0000 |
|---|
| 11 | 'spiders' | 0.2567 | 0.0000 |
|---|
| 12 | 'ast' | 0.2504 | 0.0000 |
|---|
| 13 | 'trig' | 0.1916 | 0.0014 |
|---|
| 14 | 'platelet' | 0.1622 | 0.0069 |
|---|
| 15 | 'age' | 0.1432 | 0.0173 |
|---|
| 16 | 'protime' | 0.1366 | 0.0232 |
|---|
| 17 | 'logpro' | 0.1366 | 0.0232 |
|---|
| 18 | 'chol' | 0.0868 | 0.1504 |
|---|
| 19 | 'alkphos' | 0.0626 | 0.2999 |
|---|
| 20 | 'sex' | 0.0560 | 0.3537 |
|---|
| 21 | 'trt' | 0.0133 | 0.8262 |
|---|
[R_pear,P_pear] = corr(table2array(cirrhosis),'Type','Pearson','rows','complete');
r_pear_map = heatmap(R_pear);
title('Pairwise Pearson Correlations')
r_pear_map.XDisplayLabels = cirrhosis_names;
r_pear_map.YDisplayLabels = cirrhosis_names;
% Features sorted by Correlation
absR_pear_time = abs(R_pear(1:end-1,end));
[absR_pear_time,I] = sort(absR_pear_time,'descend');
P_rank_time = P_pear(1:end-1,end);
cirrhosis_names_table = cirrhosis_names(1:end-1);
cirrhosis_names_table = cirrhosis_names_table(I);
pearsonTable = table(cirrhosis_names_table',absR_pear_time,P_rank_time(I),...
'VariableNames',{'Feature','Correlation','pValue'})
pearsonTable = 21×3 table
| | Feature | Correlation | pValue |
|---|
| 1 | 'logbil' | 0.4882 | 0.0000 |
|---|
| 2 | 'bili' | 0.4303 | 0.0000 |
|---|
| 3 | 'logalb' | 0.4026 | 0.0000 |
|---|
| 4 | 'albumin' | 0.4019 | 0.0000 |
|---|
| 5 | 'status' | 0.3845 | 0.0000 |
|---|
| 6 | 'stage' | 0.3639 | 0.0000 |
|---|
| 7 | 'copper' | 0.3614 | 0.0000 |
|---|
| 8 | 'edema' | 0.3471 | 0.0000 |
|---|
| 9 | 'ascites' | 0.3294 | 0.0000 |
|---|
| 10 | 'hepato' | 0.2610 | 0.0000 |
|---|
| 11 | 'spiders' | 0.2606 | 0.0000 |
|---|
| 12 | 'ast' | 0.1911 | 0.0014 |
|---|
| 13 | 'trig' | 0.1639 | 0.0064 |
|---|
| 14 | 'platelet' | 0.1591 | 0.0081 |
|---|
| 15 | 'age' | 0.1432 | 0.0173 |
|---|
| 16 | 'chol' | 0.1367 | 0.0231 |
|---|
| 17 | 'protime' | 0.1284 | 0.0330 |
|---|
| 18 | 'logpro' | 0.1275 | 0.0343 |
|---|
| 19 | 'alkphos' | 0.1044 | 0.0833 |
|---|
| 20 | 'sex' | 0.0294 | 0.6269 |
|---|
| 21 | 'trt' | 0.0193 | 0.7492 |
|---|
[R_pear_log,P_pear_log] = corr(table2array(cirrhosis_log),'Type','Pearson','rows','complete');
r_pear_log_map = heatmap(R_pear_log);
title('Pairwise Pearson Correlations (Log Time)')
r_pear_log_map.XDisplayLabels = cirrhosis_log_names;
r_pear_log_map.YDisplayLabels = cirrhosis_log_names;
absR_pear_log_time = abs(R_pear_log(1:end-1,end));
[absR_pear_log_time,I] = sort(absR_pear_log_time,'descend');
P_rank_time = P_pear_log(1:end-1,end);
cirrhosis_names_table = cirrhosis_log_names(1:end-1);
cirrhosis_names_table = cirrhosis_names_table(I);
pearsonLogTable = table(cirrhosis_names_table',absR_pear_log_time,P_rank_time(I),...
'VariableNames',{'Feature','Correlation','pValue'})
pearsonLogTable = 20×3 table
| | Feature | Correlation | pValue |
|---|
| 1 | 'edema' | 0.5402 | 0.0000 |
|---|
| 2 | 'logbil' | 0.5242 | 0.0000 |
|---|
| 3 | 'ascites' | 0.5133 | 0.0000 |
|---|
| 4 | 'bili' | 0.5025 | 0.0000 |
|---|
| 5 | 'logalb' | 0.4600 | 0.0000 |
|---|
| 6 | 'albumin' | 0.4471 | 0.0000 |
|---|
| 7 | 'copper' | 0.4019 | 0.0000 |
|---|
| 8 | 'stage' | 0.3815 | 0.0000 |
|---|
| 9 | 'logpro' | 0.3164 | 0.0000 |
|---|
| 10 | 'protime' | 0.3150 | 0.0000 |
|---|
| 11 | 'spiders' | 0.2934 | 0.0000 |
|---|
| 12 | 'ast' | 0.2402 | 0.0001 |
|---|
| 13 | 'platelet' | 0.2279 | 0.0001 |
|---|
| 14 | 'hepato' | 0.2221 | 0.0002 |
|---|
| 15 | 'age' | 0.2204 | 0.0002 |
|---|
| 16 | 'trig' | 0.1642 | 0.0063 |
|---|
| 17 | 'alkphos' | 0.0486 | 0.4214 |
|---|
| 18 | 'chol' | 0.0431 | 0.4759 |
|---|
| 19 | 'sex' | 0.0384 | 0.5249 |
|---|
| 20 | 'trt' | 0.0095 | 0.8748 |
|---|
% Remove status and try to find pca that detemines best features to predict status
cirrhosis_status_rm = cirrhosis_status;
cirrhosis_status_rm.status = [];
[coef, score, ~, ~, explained] = pca(zscore(table2array(cirrhosis_status_rm)));
% [coef, score, ~, ~, explained] = pca(table2array(cirrhosis_status_rm)); %Use this instead if you are working zscore data
% %Variance explained by PC1 and PC2
% Plot PCA and color by status
gscatter(score(:,1), score(:,2), cirrhosis_status.status)
xlabel('Principal Component 1')
ylabel('Principal Component 2')
title('PCA of Cirrhosis Patient data')
% Table of PCA Coefficients
T = table(cirrhosis_status_rm.Properties.VariableNames', PC1_coeff, PC2_coeff)
T = 20×3 table
| | Var1 | PC1_coeff | PC2_coeff |
|---|
| 1 | 'age' | 0.1256 | -0.2995 |
|---|
| 2 | 'albumin' | -0.2856 | 0.1006 |
|---|
| 3 | 'alkphos' | 0.0900 | 0.1759 |
|---|
| 4 | 'ascites' | 0.2788 | -0.1801 |
|---|
| 5 | 'bili' | 0.3298 | 0.2530 |
|---|
| 6 | 'chol' | 0.0963 | 0.4533 |
|---|
| 7 | 'edema' | 0.2865 | -0.2080 |
|---|
| 8 | 'hepato' | 0.2116 | 0.0657 |
|---|
| 9 | 'platelet' | -0.1308 | 0.2553 |
|---|
| 10 | 'protime' | 0.2544 | -0.2676 |
|---|
| 11 | 'sex' | -0.0329 | 0.0521 |
|---|
| 12 | 'ast' | 0.1802 | 0.3177 |
|---|
| 13 | 'spiders' | 0.1987 | -0.0039 |
|---|
| 14 | 'stage' | 0.2399 | -0.0663 |
|---|
| 15 | 'trt' | 0.0060 | 0.0729 |
|---|
| 16 | 'trig' | 0.1579 | 0.2934 |
|---|
| 17 | 'copper' | 0.2531 | 0.1503 |
|---|
| 18 | 'logalb' | -0.2895 | 0.1048 |
|---|
| 19 | 'logbil' | 0.3459 | 0.2797 |
|---|
| 20 | 'logpro' | 0.2589 | -0.2677 |
|---|
% Table of PCA sorted by PC1
T_PC1 = sortrows(T,'PC1_coeff','descend')
T_PC1 = 20×3 table
| | Var1 | PC1_coeff | PC2_coeff |
|---|
| 1 | 'logbil' | 0.3459 | 0.2797 |
|---|
| 2 | 'bili' | 0.3298 | 0.2530 |
|---|
| 3 | 'edema' | 0.2865 | -0.2080 |
|---|
| 4 | 'ascites' | 0.2788 | -0.1801 |
|---|
| 5 | 'logpro' | 0.2589 | -0.2677 |
|---|
| 6 | 'protime' | 0.2544 | -0.2676 |
|---|
| 7 | 'copper' | 0.2531 | 0.1503 |
|---|
| 8 | 'stage' | 0.2399 | -0.0663 |
|---|
| 9 | 'hepato' | 0.2116 | 0.0657 |
|---|
| 10 | 'spiders' | 0.1987 | -0.0039 |
|---|
| 11 | 'ast' | 0.1802 | 0.3177 |
|---|
| 12 | 'trig' | 0.1579 | 0.2934 |
|---|
| 13 | 'age' | 0.1256 | -0.2995 |
|---|
| 14 | 'chol' | 0.0963 | 0.4533 |
|---|
| 15 | 'alkphos' | 0.0900 | 0.1759 |
|---|
| 16 | 'trt' | 0.0060 | 0.0729 |
|---|
| 17 | 'sex' | -0.0329 | 0.0521 |
|---|
| 18 | 'platelet' | -0.1308 | 0.2553 |
|---|
| 19 | 'albumin' | -0.2856 | 0.1006 |
|---|
| 20 | 'logalb' | -0.2895 | 0.1048 |
|---|
% Table of PCA sorted by PC2
T_PC2 = sortrows(T,'PC2_coeff','descend')
T_PC2 = 20×3 table
| | Var1 | PC1_coeff | PC2_coeff |
|---|
| 1 | 'chol' | 0.0963 | 0.4533 |
|---|
| 2 | 'ast' | 0.1802 | 0.3177 |
|---|
| 3 | 'trig' | 0.1579 | 0.2934 |
|---|
| 4 | 'logbil' | 0.3459 | 0.2797 |
|---|
| 5 | 'platelet' | -0.1308 | 0.2553 |
|---|
| 6 | 'bili' | 0.3298 | 0.2530 |
|---|
| 7 | 'alkphos' | 0.0900 | 0.1759 |
|---|
| 8 | 'copper' | 0.2531 | 0.1503 |
|---|
| 9 | 'logalb' | -0.2895 | 0.1048 |
|---|
| 10 | 'albumin' | -0.2856 | 0.1006 |
|---|
| 11 | 'trt' | 0.0060 | 0.0729 |
|---|
| 12 | 'hepato' | 0.2116 | 0.0657 |
|---|
| 13 | 'sex' | -0.0329 | 0.0521 |
|---|
| 14 | 'spiders' | 0.1987 | -0.0039 |
|---|
| 15 | 'stage' | 0.2399 | -0.0663 |
|---|
| 16 | 'ascites' | 0.2788 | -0.1801 |
|---|
| 17 | 'edema' | 0.2865 | -0.2080 |
|---|
| 18 | 'protime' | 0.2544 | -0.2676 |
|---|
| 19 | 'logpro' | 0.2589 | -0.2677 |
|---|
| 20 | 'age' | 0.1256 | -0.2995 |
|---|
%Box plots exploring top 5 features recognized as having most weight from from PC1
%Features split on status
set(f, 'Position', expandFigSize)
sgtitle('Most important features identifed by PC1 split by survival status')
features = T_PC1.Var1(1:5);
features_analyzed = cell(5,2);
features_analyzed(:,1) = features;
boxplot(cirrhosis_status.(features{i}),cirrhosis_status.status, 'Labels',{'Survived','Died'})
[h,p] = ttest2(cirrhosis_status.(features{i})(cirrhosis_status.status == 0),cirrhosis_status.(features{i})(cirrhosis_status.status == 1));
features_analyzed{i,2} = p;
T_PC1_analyzed = cell2table(features_analyzed);
T_PC1_analyzed.Properties.VariableNames = {'Features','P values'}
T_PC1_analyzed = 5×2 table
| | Features | P values |
|---|
| 1 | 'logbil' | 7.4316e-18 |
|---|
| 2 | 'bili' | 3.5816e-13 |
|---|
| 3 | 'edema' | 1.5362e-08 |
|---|
| 4 | 'ascites' | 3.0548e-07 |
|---|
| 5 | 'logpro' | 3.8126e-12 |
|---|
idx = kmeans(table2array(cirrhosis_status),kValues(i)); % k-means clustering
s = silhouette(table2array(cirrhosis_status),idx); % silhouette values
s_score(i) = mean(s); % silhouette score
table(kValues',s_score)
ans = 9×2 table
| | Var1 | s_score |
|---|
| 1 | 2 | 0.9274 |
|---|
| 2 | 3 | 0.8810 |
|---|
| 3 | 4 | 0.7412 |
|---|
| 4 | 5 | 0.7236 |
|---|
| 5 | 6 | 0.6846 |
|---|
| 6 | 7 | 0.6478 |
|---|
| 7 | 8 | 0.6429 |
|---|
| 8 | 9 | 0.5772 |
|---|
| 9 | 10 | 0.5978 |
|---|
k = find(s_score==max(s_score))+1 % k = 2 is best
% Plot silhouette for optimal cluster size
idx = kmeans(table2array(cirrhosis_status),k);
silhouette(table2array(cirrhosis_status),idx)
title('Silhouette Plot k=2')
%Predicting status for each cluster from k-means
cluster1 = (cirrhosis_status.status(idx == 1));
cluster1_survived = sum((cluster1 == 0))
cluster1_died = sum((cluster1 == 1))
cluster2 = (cirrhosis_status.status(idx == 2));
cluster2_survived = sum((cluster2 == 0))
cluster2_died = sum((cluster2 == 1))
% Top 10 features for linear regression
[B,I] = maxk(abs(R_pear_log(end,1:end-1)),10);
linear_reg_names = [{'(Intercept)'},cirrhosis_log_names(I)];
% Generate crossvalidation groups
indices = crossvalind('kfold',size(cirrhosis_log,1),NFolds);
% Declare datasets for storing the outputs
regressionModel = struct();
subsets = {subset, subset_log};
% Extract train rows from X and Y
trainData = cirrhosis_log(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_log(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = table2array(testData(:,end));
[lassoB,lassoFit] = lasso(Xtrain,Ytrain,'CV',NFolds);
coef0 = lassoFit.Intercept(lassoFit.IndexMinMSE);
coef = lassoB(:,lassoFit.IndexMinMSE);
Ypred = Xtest*coef + coef0;
[~,maxCoefIdx] = maxk(abs(coef),ncoef);
% Save fit for later and evaluate the model
regressionModel.lasso.fit{i} = lassoFit;
regressionModel.lasso.coeffVals{i} = coef(maxCoefIdx);
regressionModel.lasso.coeffNames{i} = cirrhosis_names(maxCoefIdx)';
regressionModel.lasso.ncoef(i) = ncoef;
regressionModel.lasso.absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.lasso.corrR(i),regressionModel.lasso.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.lasso.rankCorrR(i),regressionModel.lasso.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
stepFit = stepwiselm(trainData,'Upper','linear','ResponseVar','time','CategoricalVars',categVar,'Verbose',0);
stepFit = stepwiselm(trainData,'Upper','linear','ResponseVar','time','PredictorVars',subsets{j-1},'Verbose',0);
Ypred = stepFit.predict(Xtest);
[~,coefIdx] = sort(stepFit.Coefficients.pValue);
% Save fit for later and evaluate the model
regressionModel.step(j).fit{i} = stepFit;
regressionModel.step(j).coeffNames{i} = stepFit.Coefficients.Row(coefIdx);
regressionModel.step(j).coeffVals{i} = stepFit.Coefficients.Estimate(coefIdx);
regressionModel.step(j).ncoef(i) = stepFit.NumPredictors;
regressionModel.step(j).rSquared(i) = stepFit.Rsquared.Ordinary;
regressionModel.step(j).rSquaredAdj(i) = stepFit.Rsquared.Adjusted;
regressionModel.step(j).absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.step(j).corrR(i),regressionModel.step(j).corrP(i)] = corr(Ytest,Ypred);
[regressionModel.step(j).rankCorrR(i),regressionModel.step(j).rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
mdl = fitlm(Xtrain(:,I),Ytrain);
Ypred = mdl.predict(Xtest(:,I));
[~,coefIdx] = sort(mdl.Coefficients.pValue);
regressionModel.linear.fit{i} = mdl;
regressionModel.linear.features{i} = linear_reg_names;
regressionModel.linear.coeffNames{i} = linear_reg_names(coefIdx)';
regressionModel.linear.coeffVals{i} = mdl.Coefficients.Estimate(coefIdx);
regressionModel.linear.rSquared(i) = mdl.Rsquared.Ordinary;
regressionModel.linear.rSquaredAdj(i) = mdl.Rsquared.Adjusted;
regressionModel.linear.ncoef(i) = length(B);
regressionModel.linear.absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.linear.corrR(i),regressionModel.linear.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.linear.rankCorrR(i),regressionModel.linear.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
rowNames = cellfun(@(s)sprintf('Subsample %d',s),num2cell(transpose(1:NFolds)),'UniformOutput',false);
varNames = ["ncoeff","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
table(regressionModel.lasso.ncoef',regressionModel.lasso.absError',regressionModel.lasso.corrR',...
regressionModel.lasso.corrP',regressionModel.lasso.rankCorrR',regressionModel.lasso.rankCorrP',...
'VariableNames', varNames,'RowNames', rowNames)
ans = 5×6 table
| | ncoeff | Average Absolute Error | Pearsons Correlation | Pearsons Correlation P-val | Rank Correlation | Rank Correlation P-val |
|---|
| 1 Subsample 1 | 9 | 0.4432 | 0.7036 | 1.4711e-09 | 0.5347 | 2.1819e-05 |
|---|
| 2 Subsample 2 | 8 | 0.5350 | 0.7467 | 5.9488e-11 | 0.6369 | 3.7350e-07 |
|---|
| 3 Subsample 3 | 10 | 0.3987 | 0.6856 | 7.6779e-09 | 0.6007 | 2.1068e-06 |
|---|
| 4 Subsample 4 | 16 | 0.5324 | 0.6055 | 9.6915e-07 | 0.5458 | 2.2436e-05 |
|---|
| 5 Subsample 5 | 14 | 0.4332 | 0.4938 | 1.2742e-04 | 0.4566 | 5.2559e-04 |
|---|
maxRow = max(regressionModel.lasso.ncoef);
tempT = table([regressionModel.lasso.coeffNames{i},num2cell(regressionModel.lasso.coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T
T = 16×5 table
| | Subset 1 | Subset 2 | Subset 3 | Subset 4 | Subset 5 |
|---|
| 1 | 'copper' | 1.0398 | 'edema' | -0.8454 | 'copper' | 0.8147 | 'copper' | 1.2322 | 'copper' | 0.8193 |
|---|
| 2 | 'ascites' | -0.5812 | 'copper' | 0.5156 | 'edema' | -0.6898 | 'ascites' | -0.4140 | 'edema' | -0.5482 |
|---|
| 3 | 'edema' | -0.4742 | 'ascites' | -0.4198 | 'ascites' | -0.5355 | 'edema' | -0.3415 | 'ascites' | -0.3432 |
|---|
| 4 | 'logalb' | -0.1906 | 'logalb' | -0.1689 | 'logalb' | -0.1439 | 'logalb' | -0.1007 | 'albumin' | 0.1643 |
|---|
| 5 | 'stage' | -0.0950 | 'stage' | -0.0834 | 'stage' | -0.0620 | 'stage' | -0.0899 | 'logalb' | -0.1306 |
|---|
| 6 | 'spiders' | -0.0741 | 'bili' | -6.2160e-04 | 'spiders' | -0.0194 | 'hepato' | 0.0874 | 'stage' | -0.0567 |
|---|
| 7 | 'bili' | -0.0018 | 'trig' | -3.7997e-04 | 'bili' | -0.0084 | 'spiders' | -0.0740 | 'spiders' | -0.0367 |
|---|
| 8 | 'trig' | -7.2035e-04 | 'alkphos' | 3.4316e-05 | 'age' | -0.0021 | 'status' | -0.0302 | 'bili' | -0.0274 |
|---|
| 9 | 'alkphos' | 3.5227e-05 | NaN | NaN | 'trig' | -7.9512e-04 | 'bili' | -0.0288 | 'status' | -0.0101 |
|---|
| 10 | NaN | NaN | NaN | NaN | 'alkphos' | 3.5382e-05 | 'protime' | -0.0181 | 'age' | -0.0017 |
|---|
| 11 | NaN | NaN | NaN | NaN | NaN | NaN | 'age' | -0.0060 | 'trig' | -9.7533e-04 |
|---|
| 12 | NaN | NaN | NaN | NaN | NaN | NaN | 'trig' | -0.0016 | 'platelet' | 3.7506e-04 |
|---|
| 13 | NaN | NaN | NaN | NaN | NaN | NaN | 'trt' | 7.9540e-04 | 'chol' | 1.9142e-04 |
|---|
| 14 | NaN | NaN | NaN | NaN | NaN | NaN | 'platelet' | 3.3887e-04 | 'alkphos' | 4.3230e-05 |
|---|
| 15 | NaN | NaN | NaN | NaN | NaN | NaN | 'ast' | -2.9267e-04 | NaN | NaN |
|---|
| 16 | NaN | NaN | NaN | NaN | NaN | NaN | 'alkphos' | 4.6125e-05 | NaN | NaN |
|---|
linVarNames = ["R-Squared","Adjusted R-Squared","ncoeff","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
table(regressionModel.linear.rSquared',regressionModel.linear.rSquaredAdj',regressionModel.linear.ncoef',...
regressionModel.linear.absError',regressionModel.linear.corrR',regressionModel.linear.corrP',...
regressionModel.linear.rankCorrR',regressionModel.linear.rankCorrP',...
'VariableNames', linVarNames,'RowNames', rowNames)
ans = 5×8 table
| | R-Squared | Adjusted R-Squared | ncoeff | Average Absolute Error | Pearsons Correlation | Pearsons Correlation P-val | Rank Correlation | Rank Correlation P-val |
|---|
| 1 Subsample 1 | 0.4958 | 0.4717 | 10 | 0.4621 | 0.7011 | 1.7786e-09 | 0.5230 | 0.0000 |
|---|
| 2 Subsample 2 | 0.4516 | 0.4255 | 10 | 0.5266 | 0.7483 | 5.1533e-11 | 0.6049 | 0.0000 |
|---|
| 3 Subsample 3 | 0.5043 | 0.4807 | 10 | 0.4358 | 0.5751 | 4.3764e-06 | 0.5465 | 0.0000 |
|---|
| 4 Subsample 4 | 0.5061 | 0.4826 | 10 | 0.5072 | 0.6412 | 1.3378e-07 | 0.5281 | 0.0000 |
|---|
| 5 Subsample 5 | 0.5341 | 0.5119 | 10 | 0.4403 | 0.4869 | 1.6364e-04 | 0.4272 | 0.0013 |
|---|
maxRow = max(regressionModel.linear.ncoef);
tempT = table([regressionModel.linear.coeffNames{i},num2cell(regressionModel.linear.coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T
T = 11×5 table
| | Subset 1 | Subset 2 | Subset 3 | Subset 4 | Subset 5 |
|---|
| 1 | 'stage' | -0.1503 | 'edema' | -0.9025 | 'edema' | -0.7777 | 'copper' | -0.0014 | 'edema' | -0.6937 |
|---|
| 2 | 'ascites' | -0.5725 | 'stage' | -0.1236 | 'logpro' | 18.0918 | 'stage' | -0.1230 | 'copper' | -0.0011 |
|---|
| 3 | 'edema' | -0.4901 | 'ascites' | -0.4427 | 'protime' | -1.6187 | 'edema' | -0.4783 | 'stage' | -0.1046 |
|---|
| 4 | 'logalb' | 9.2550 | 'logbil' | -0.1540 | 'ascites' | -0.4853 | 'ascites' | -0.3469 | 'ascites' | -0.3729 |
|---|
| 5 | 'logbil' | -0.1591 | 'logpro' | 6.6627 | 'logalb' | 5.8094 | 'protime' | -0.5744 | 'protime' | -0.7450 |
|---|
| 6 | 'albumin' | -2.3638 | 'protime' | -0.5577 | 'stage' | -0.1089 | 'logalb' | 3.8982 | 'logpro' | 8.3625 |
|---|
| 7 | 'copper' | -9.2122e-04 | 'copper' | -5.1500e-04 | '(Intercept)' | -19.5510 | 'bili' | -0.0236 | 'logbil' | -0.1177 |
|---|
| 8 | 'protime' | -0.5301 | 'logalb' | 2.5833 | 'copper' | -9.1400e-04 | 'logpro' | 6.1645 | 'bili' | -0.0224 |
|---|
| 9 | 'logpro' | 5.9515 | 'albumin' | -0.6051 | 'logbil' | -0.1387 | 'logbil' | -0.0908 | '(Intercept)' | -5.4262 |
|---|
| 10 | 'bili' | -0.0106 | '(Intercept)' | -2.8779 | 'albumin' | -1.4768 | 'albumin' | -0.7797 | 'albumin' | 0.2574 |
|---|
| 11 | '(Intercept)' | -3.5729 | 'bili' | -0.0045 | 'bili' | -0.0080 | '(Intercept)' | -2.4954 | 'logalb' | 0.6020 |
|---|
stepVarNames = ["R-Squared","Adjusted R-Squared","Number of Predictors","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
stepTables{i} = table(regressionModel.step(i).rSquared',regressionModel.step(i).rSquaredAdj',...
regressionModel.step(i).ncoef',regressionModel.step(i).absError',regressionModel.step(i).corrR',...
regressionModel.step(i).corrP',regressionModel.step(i).rankCorrR',regressionModel.step(i).rankCorrP',...
'VariableNames', stepVarNames,'RowNames', rowNames);
overSum.step(1,i) = mean(regressionModel.step(i).corrR);
overSum.step(2,i) = mean(regressionModel.step(i).rankCorrR);
overSum.step(3,i) = mean(regressionModel.step(i).absError);
overSum.step(4,i) = mean(regressionModel.step(i).ncoef);
overSum.linear = [mean(regressionModel.linear.corrR)
mean(regressionModel.linear.rankCorrR)
mean(regressionModel.linear.absError)
mean(regressionModel.linear.ncoef)];
overSum.lasso = [mean(regressionModel.lasso.corrR)
mean(regressionModel.lasso.rankCorrR)
mean(regressionModel.lasso.absError)
mean(regressionModel.lasso.ncoef)];
sumVarNames = ["Stepwise Full", "Stepwise Subset", "Stepwise Subset Log", "Linear FitLm", "Lasso"];
sumRowNames = ["Pearsons Correlation","Rank Correlation","Mean Absolute Error","Number of Coefficients"];
table(overSum.step(:,1),overSum.step(:,2),overSum.step(:,3),overSum.linear,overSum.lasso,...
'VariableNames', sumVarNames,'RowNames', sumRowNames)
ans = 4×5 table
| | Stepwise Full | Stepwise Subset | Stepwise Subset Log | Linear FitLm | Lasso |
|---|
| 1 Pearsons Correlation | 0.6467 | 0.6170 | 0.6588 | 0.6305 | 0.6470 |
|---|
| 2 Rank Correlation | 0.5632 | 0.5578 | 0.5627 | 0.5259 | 0.5549 |
|---|
| 3 Mean Absolute Error | 0.4638 | 0.4835 | 0.4637 | 0.4744 | 0.4685 |
|---|
| 4 Number of Coefficients | 6.0000 | 5.0000 | 5.2000 | 10.0000 | 11.4000 |
|---|
% Number of features to select
cirrhosis_status_array = table2array(cirrhosis_status);
accuracyGuess = zeros(100,NFolds);
meanGuess = zeros(NFolds,1);
sigDiff = zeros(NFolds,2);
% Extract train rows from X and Y
Xtrain = cirrhosis_status_array(train,1:end-1);
Ytrain = cirrhosis_status_array(train,end);
% Extract test rows from X and Y
Xtest = cirrhosis_status_array(test,1:end-1);
Ytest = cirrhosis_status_array(test,end);
% Create logistic model with train data.
[b,dev,stats] = mnrfit(Xtrain, categorical(Ytrain));
[p,I] = sort(stats.p(2:end,:));
[p_min,min_idx] = min(p,[],2);
regressionModel.logist.bestP{i} = p_min(1:NFeatures);
regressionModel.logist.bestFeatures{i} = diag(I(1:NFeatures,min_idx(1:NFeatures)));
regressionModel.logist.bestFeaturesNames{i} = cirrhosis_status_names(regressionModel.logist.bestFeatures{i})';
regressionModel.logist.bestFeaturesCoeff{i} = b(I(1:NFeatures));
% Predict Y values from model.
[~,Ypred] = max(Ypred,[],2);
Ypred = double(Ypred - 1);
%Comparing model to 100 random guesses
random_pred = Ytrain(randperm(length(Ytest)));
accuracyGuess(k,i) = sum(random_pred == Ytest)/length(Ytest);
meanGuess(i) = mean(accuracyGuess(:,i));
accuracy = sum(Ypred == Ytest)/length(Ypred);
%T test comparing model accuracy to random guess accuracy
[sigDiff(i,1),sigDiff(i,2)] = ttest2(accuracy,accuracyGuess(:,i));
meanGuessError = meanGuess(i) - accuracy;
regressionModel.logist.confusion{i} = confusionmat(Ytest,Ypred);
regressionModel.logist.accuracy(i) = accuracy;
regressionModel.logist.precision(i) = sum(Ypred ~=0 & Ypred==Ytest)/sum(Ypred~=0);
regressionModel.logist.recall(i) = sum(Ypred ~=0 & Ytest ==Ypred)/sum(Ytest~=0);
regressionModel.logist.absError(i) = mean(abs(Ypred - Ytest));
[regressionModel.logist.corrR(i),regressionModel.logist.corrP(i)] = corr(Ypred,Ytest);
[regressionModel.logist.rankCorrR(i),regressionModel.logist.rankCorrP(i)] = corr(Ypred,Ytest,'Type','Spearman');
regressionModel.logist.meanGuessError(i) = meanGuessError;
regressionModel.logist.compareGuess(i) = sigDiff(i,2);
end
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 2.164030e-16.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 7.961029e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 6.517939e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 5.336438e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 4.369106e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.577122e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.436861e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.302100e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.172622e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.147343e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.122264e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.117273e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.112289e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.107313e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.102346e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.101353e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.101155e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100956e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100758e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100559e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100520e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100520e-17.
logVarNames = ["Accuracy","Precision","Recall","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val","Mean Guess Error","Guess Difference"];
table(regressionModel.logist.accuracy',regressionModel.logist.precision',regressionModel.logist.recall',...
regressionModel.logist.absError',regressionModel.logist.corrR',regressionModel.logist.corrP',...
regressionModel.logist.rankCorrR',regressionModel.logist.rankCorrP', regressionModel.logist.meanGuessError',...
regressionModel.logist.compareGuess','VariableNames',logVarNames,'RowNames', rowNames)
ans = 5×10 table
| | Accuracy | Precision | Recall | Average Absolute Error | Pearsons Correlation | Pearsons Correlation P-val | Rank Correlation | Rank Correlation P-val | Mean Guess Error | Guess Difference |
|---|
| 1 Subsample 1 | 0.7143 | 0.6818 | 0.6250 | 0.2857 | 0.4116 | 0.0016 | 0.4116 | 0.0016 | -0.2414 | 0.0005 |
|---|
| 2 Subsample 2 | 0.7636 | 0.7273 | 0.6957 | 0.2364 | 0.5116 | 0.0001 | 0.5116 | 0.0001 | -0.2829 | 0.0000 |
|---|
| 3 Subsample 3 | 0.8182 | 0.6842 | 0.7647 | 0.1818 | 0.5897 | 0.0000 | 0.5897 | 0.0000 | -0.4120 | 0.0000 |
|---|
| 4 Subsample 4 | 0.7455 | 0.7143 | 0.6522 | 0.2545 | 0.4718 | 0.0003 | 0.4718 | 0.0003 | -0.2698 | 0.0000 |
|---|
| 5 Subsample 5 | 0.7091 | 0.7500 | 0.5000 | 0.2909 | 0.4051 | 0.0022 | 0.4051 | 0.0022 | -0.2316 | 0.0012 |
|---|
set(f, 'Position', expandFigSize)
subplot(ceil(NFolds/2),floor(NFolds/2),i)
cm = confusionchart(regressionModel.logist.confusion{i});
cm.Title = sprintf("Subsample %d",i);
sgtitle("Status Classification Using Logistic Regression")
tempT = table([regressionModel.logist.bestFeaturesNames{i},...
num2cell(regressionModel.logist.bestFeaturesCoeff{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T
T = 8×5 table
| | Subset 1 | Subset 2 | Subset 3 | Subset 4 | Subset 5 |
|---|
| 1 | 'age' | 88.8792 | 'alkphos' | -7.2926 | 'age' | 22.5655 | 'alkphos' | -6.7341 | 'age' | 113.5512 |
|---|
| 2 | 'alkphos' | -3.9411 | 'age' | 79.1033 | 'alkphos' | -6.3837 | 'age' | 58.3636 | 'logpro' | -0.5337 |
|---|
| 3 | 'logpro' | -0.5764 | 'logpro' | -0.5537 | 'ast' | 0.2563 | 'copper' | -0.0025 | 'protime' | -0.0016 |
|---|
| 4 | 'ast' | 0.5772 | 'ast' | 0.7510 | 'spiders' | -0.0061 | 'sex' | 2.8046 | 'sex' | 5.4002 |
|---|
| 5 | 'protime' | -0.0020 | 'trig' | 0.1450 | 'albumin' | -0.0660 | 'logbil' | 24.4058 | 'trig' | 0.1485 |
|---|
| 6 | 'logbil' | 13.6599 | 'protime' | 0.0013 | 'trig' | 0.1241 | 'logpro' | -0.6428 | 'alkphos' | -6.4620 |
|---|
| 7 | 'copper' | -2.9115e-05 | 'sex' | 3.6249 | 'logalb' | -0.0027 | 'stage' | -0.4353 | 'hepato' | -0.3488 |
|---|
| 8 | 'stage' | 0.1487 | 'logbil' | 24.6619 | 'copper' | -0.0041 | 'trt' | -0.2967 | 'logbil' | 24.0589 |
|---|
% Extract train rows from X and Y
trainData = cirrhosis_log(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_log(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = table2array(testData(:,end));
% Create linear model with train data.
rfMdlTime = TreeBagger(100,Xtrain,Ytrain,'Method','regression','OOBPrediction','On','OOBPredictorImportance','on');
Ypred = rfMdlTime.predict(Xtest);
regressionModel.rfMdlTime.fit{i} = rfMdlTime;
regressionModel.rfMdlTime.absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.rfMdlTime.corrR(i),regressionModel.rfMdlTime.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.rfMdlTime.rankCorrR(i),regressionModel.rfMdlTime.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
set(f, 'Position', expandFigSize)
subplot(ceil(NFolds/2),floor(NFolds/2),i)
bar(regressionModel.rfMdlTime.fit{i}.OOBPermutedPredictorDeltaError)
ylabel('Out-of-Bag Feature Importance')
title(sprintf('Subsample %i',i))
xticks(1:length(cirrhosis_log_names)-1)
xticklabels(cirrhosis_log_names(1:end-1))
sgtitle('Out-of-Bag Feature Imp for Time Prediction')
rfVarNames = ["Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
table(regressionModel.rfMdlTime.absError',regressionModel.rfMdlTime.corrR',...
regressionModel.rfMdlTime.corrP',regressionModel.rfMdlTime.rankCorrR',regressionModel.rfMdlTime.rankCorrP',...
'VariableNames', rfVarNames,'RowNames', rowNames)
ans = 5×5 table
| | Average Absolute Error | Pearsons Correlation | Pearsons Correlation P-val | Rank Correlation | Rank Correlation P-val |
|---|
| 1 Subsample 1 | 0.4367 | 0.6848 | 0.0000 | 0.5654 | 0.0000 |
|---|
| 2 Subsample 2 | 0.4987 | 0.8368 | 0.0000 | 0.7093 | 0.0000 |
|---|
| 3 Subsample 3 | 0.3632 | 0.7669 | 0.0000 | 0.6246 | 0.0000 |
|---|
| 4 Subsample 4 | 0.5234 | 0.5887 | 0.0000 | 0.5470 | 0.0000 |
|---|
| 5 Subsample 5 | 0.4535 | 0.3926 | 0.0030 | 0.3678 | 0.0060 |
|---|
accuracyGuess = zeros(100,NFolds);
meanGuess = zeros(NFolds,1);
sigDiff = zeros(NFolds,2);
% Extract train rows from X and Y
trainData = cirrhosis_status(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_status(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = double(table2array(testData(:,end)));
% Create linear model with train data.
rfMdlStatus = TreeBagger(100,Xtrain,Ytrain,'OOBPrediction','On','OOBPredictorImportance','on');
Ypred = double(categorical(rfMdlStatus.predict(Xtest))) - 1;
%Comparing model to 100 random guesses
random_pred = Ytrain(randperm(length(Ytest)));
accuracyGuess(k,i) = sum(random_pred == Ytest)/length(Ytest);
meanGuess(i) = mean(accuracyGuess(:,i));
accuracy = sum(Ypred == Ytest)/length(Ypred);
%T test comparing model accuracy to random guess accuracy
[sigDiff(i,1),sigDiff(i,2)] = ttest2(accuracy,accuracyGuess(:,i));
meanGuessError = meanGuess(i) - accuracy;
regressionModel.rfMdlStatus.fit{i} = rfMdlStatus;
regressionModel.rfMdlStatus.confusion{i} = confusionmat(Ytest,Ypred);
regressionModel.rfMdlStatus.absError(i) = mean(abs(Ypred-Ytest));
regressionModel.rfMdlStatus.accuracy(i) = accuracy;
[regressionModel.rfMdlStatus.corrR(i),regressionModel.rfMdlStatus.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.rfMdlStatus.rankCorrR(i),regressionModel.rfMdlStatus.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
regressionModel.rfMdlStatus.meanGuessError(i) = meanGuessError;
regressionModel.rfMdlStatus.compareGuess(i) = sigDiff(i,2);
set(f, 'Position', expandFigSize)
subplot(ceil(NFolds/2),floor(NFolds/2),i)
bar(regressionModel.rfMdlStatus.fit{i}.OOBPermutedPredictorDeltaError)
ylabel('Out-of-Bag Feature Importance')
title(sprintf('Subsample %i',i))
xticks(1:length(cirrhosis_status_names)-1)
xticklabels(cirrhosis_status_names(1:end-1))
sgtitle('Out-of-Bag Feature Importance for Time Prediction')
rfClassVarNames = ["Average Absolute Error","Accuracy","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val","Mean Guess Error","Guess Difference"];
table(regressionModel.rfMdlStatus.absError',regressionModel.rfMdlStatus.accuracy',regressionModel.rfMdlStatus.corrR',...
regressionModel.rfMdlStatus.corrP',regressionModel.rfMdlStatus.rankCorrR',regressionModel.rfMdlStatus.rankCorrP',...
regressionModel.rfMdlStatus.meanGuessError',regressionModel.rfMdlStatus.compareGuess', ...
'VariableNames',rfClassVarNames,'RowNames', rowNames)
ans = 5×8 table
| | Average Absolute Error | Accuracy | Pearsons Correlation | Pearsons Correlation P-val | Rank Correlation | Rank Correlation P-val | Mean Guess Error | Guess Difference |
|---|
| 1 Subsample 1 | 0.2143 | 0.7857 | 0.5594 | 0.0000 | 0.5594 | 0.0000 | -0.3018 | 0.0000 |
|---|
| 2 Subsample 2 | 0.2000 | 0.8000 | 0.6000 | 0.0000 | 0.6000 | 0.0000 | -0.3335 | 0.0000 |
|---|
| 3 Subsample 3 | 0.1455 | 0.8545 | 0.6505 | 0.0000 | 0.6505 | 0.0000 | -0.4389 | 0.0000 |
|---|
| 4 Subsample 4 | 0.2909 | 0.7091 | 0.4022 | 0.0023 | 0.4022 | 0.0023 | -0.2465 | 0.0011 |
|---|
| 5 Subsample 5 | 0.2545 | 0.7455 | 0.4958 | 0.0001 | 0.4958 | 0.0001 | -0.2607 | 0.0000 |
|---|
set(f, 'Position', expandFigSize)
subplot(ceil(NFolds/2),floor(NFolds/2),i)
cm = confusionchart(regressionModel.rfMdlStatus.confusion{i});
cm.Title = sprintf("Subsample %d",i);
sgtitle("Status Classification Using Random Forest")
% Extract train rows from X and Y
trainData = cirrhosis_status(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_status(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = double(table2array(testData(:,end)));
svmMdlStatus = fitcsvm(cirrhosis_status,'status');
Ypred = double(categorical(svmMdlStatus.predict(Xtest))) - 1;
%Comparing model to 100 random guesses
random_pred = Ytrain(randperm(length(Ytest)));
accuracyGuess(k,i) = sum(random_pred == Ytest)/length(Ytest);
meanGuess(i) = mean(accuracyGuess(:,i));
accuracy = sum(Ypred == Ytest)/length(Ypred);
%T test comparing model accuracy to random guess accuracy
[sigDiff(i,1),sigDiff(i,2)] = ttest2(accuracy,accuracyGuess(:,i));
meanGuessError = meanGuess(i) - accuracy;
regressionModel.svmMdlStatus.fit{i} = svmMdlStatus;
regressionModel.svmMdlStatus.confusion{i} = confusionmat(Ytest,Ypred);
regressionModel.svmMdlStatus.absError(i) = mean(abs(Ypred-Ytest));
regressionModel.svmMdlStatus.accuracy(i) = accuracy;
[regressionModel.svmMdlStatus.corrR(i),regressionModel.svmMdlStatus.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.svmMdlStatus.rankCorrR(i),regressionModel.svmMdlStatus.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
regressionModel.svmMdlStatus.meanGuessError(i) = meanGuessError;
regressionModel.svmMdlStatus.compareGuess(i) = sigDiff(i,2);
svmClassVarNames = ["Average Absolute Error","Accuracy","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val","Mean Guess Error","Guess Difference"];
table(regressionModel.svmMdlStatus.absError',regressionModel.svmMdlStatus.accuracy',regressionModel.svmMdlStatus.corrR',...
regressionModel.svmMdlStatus.corrP',regressionModel.svmMdlStatus.rankCorrR',regressionModel.svmMdlStatus.rankCorrP',...
regressionModel.svmMdlStatus.meanGuessError',regressionModel.svmMdlStatus.compareGuess', ...
'VariableNames',svmClassVarNames,'RowNames', rowNames)
ans = 5×8 table
| | Average Absolute Error | Accuracy | Pearsons Correlation | Pearsons Correlation P-val | Rank Correlation | Rank Correlation P-val | Mean Guess Error | Guess Difference |
|---|
| 1 Subsample 1 | 0.2500 | 0.7500 | 0.4841 | 0.0002 | 0.4841 | 0.0002 | -0.2814 | 1.4628e-06 |
|---|
| 2 Subsample 2 | 0.2182 | 0.7818 | 0.5568 | 0.0000 | 0.5568 | 0.0000 | -0.3113 | 1.9432e-06 |
|---|
| 3 Subsample 3 | 0.2364 | 0.7636 | 0.4220 | 0.0013 | 0.4220 | 0.0013 | -0.3556 | 2.1356e-10 |
|---|
| 4 Subsample 4 | 0.2727 | 0.7273 | 0.4309 | 0.0010 | 0.4309 | 0.0010 | -0.2527 | 1.7325e-04 |
|---|
| 5 Subsample 5 | 0.2364 | 0.7636 | 0.5222 | 0.0000 | 0.5222 | 0.0000 | -0.2735 | 1.1850e-04 |
|---|
set(f, 'Position', expandFigSize)
subplot(ceil(NFolds/2),floor(NFolds/2),i)
cm = confusionchart(regressionModel.svmMdlStatus.confusion{i});
cm.Title = sprintf("Subsample %d",i);
sgtitle("Status Classification Using Support Vector Machine")
compVarNames = ["Logistic","Random Forest", "Support Vector Machine (SVM)"];
compRowNames = ["Pearsons Correlation","Mean Absolute Error","Accuracy"];
mean(regressionModel.logist.corrR), mean(regressionModel.rfMdlStatus.corrR), mean(regressionModel.svmMdlStatus.corrR)
mean(regressionModel.logist.absError), mean(regressionModel.rfMdlStatus.absError), mean(regressionModel.svmMdlStatus.absError)
mean(regressionModel.logist.accuracy), mean(regressionModel.rfMdlStatus.accuracy), mean(regressionModel.svmMdlStatus.accuracy)
table(compData(:,1),compData(:,2), compData(:,3),...
'VariableNames', compVarNames,'RowNames', compRowNames)
ans = 3×3 table
| | Logistic | Random Forest | Support Vector Machine (SVM) |
|---|
| 1 Pearsons Correlation | 0.4780 | 0.5416 | 0.4832 |
|---|
| 2 Mean Absolute Error | 0.2499 | 0.2210 | 0.2427 |
|---|
| 3 Accuracy | 0.7501 | 0.7790 | 0.7573 |
|---|
set(f, 'Position', [0,0,1000,500])
confusionchart(round(mean(cat(3,regressionModel.rfMdlStatus.confusion{:}),3)));
title("Random Forest Status Classification")
confusionchart(round(mean(cat(3,regressionModel.logist.confusion{:}),3)));
title("Logistic Regression Status Classification")
confusionchart(round(mean(cat(3,regressionModel.svmMdlStatus.confusion{:}),3)));
title("Support Vector Machine Status Classification")